Loading libraries

library(RColorBrewer)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.0
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(readr)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(tidyverse)
library(stringr)
library(dplyr)
library(formattable)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:formattable':
## 
##     style
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)
pinky <- read_csv("pinkyblastp_top50.txt", 
                  col_names = c("qseqid", "sseqid", "evalue", "staxids", "sscinames", "sskingdoms", "stitle")) 
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 8025 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): qseqid, sseqid, staxids, sscinames, sskingdoms, stitle
## dbl (1): evalue
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pinky
## # A tibble: 8,025 × 7
##    qseqid                        sseqid    evalue staxids sscin…¹ sskin…² stitle
##    <chr>                         <chr>      <dbl> <chr>   <chr>   <chr>   <chr> 
##  1 NODE_795_length_49550_cov_5.… ref|Y… 0         1932881 Pacman… Viruses Hypot…
##  2 NODE_795_length_49550_cov_5.… gb|QY… 1.28e-180 2862371 Pacman… Viruses polyp…
##  3 NODE_795_length_49550_cov_5.… gb|QJ… 3.69e- 98 1477405 Fausto… Viruses polyp…
##  4 NODE_795_length_49550_cov_5.… emb|S… 3.87e- 98 1477405 Fausto… Viruses Hypot…
##  5 NODE_795_length_49550_cov_5.… gb|QJ… 4.14e- 98 1477405 Fausto… Viruses hypot…
##  6 NODE_795_length_49550_cov_5.… gb|QJ… 4.86e- 98 1477405 Fausto… Viruses hypot…
##  7 NODE_795_length_49550_cov_5.… gb|QJ… 7.88e- 97 1477405 Fausto… Viruses polyp…
##  8 NODE_795_length_49550_cov_5.… gb|QJ… 8.18e- 97 1477405 Fausto… Viruses polyp…
##  9 NODE_795_length_49550_cov_5.… emb|S… 1.22e- 94 1973452 Fausto… Viruses 220 k…
## 10 NODE_795_length_49550_cov_5.… gb|AM… 4.27e- 93 1477405 Fausto… Viruses hypot…
## # … with 8,015 more rows, and abbreviated variable names ¹​sscinames,
## #   ²​sskingdoms

Filtering e-values with hits lower than 1e-10

pinky_evalue <- pinky %>% 
  filter(as.numeric(evalue) <= 1e-10)
pinky_evalue
## # A tibble: 3,923 × 7
##    qseqid                        sseqid    evalue staxids sscin…¹ sskin…² stitle
##    <chr>                         <chr>      <dbl> <chr>   <chr>   <chr>   <chr> 
##  1 NODE_795_length_49550_cov_5.… ref|Y… 0         1932881 Pacman… Viruses Hypot…
##  2 NODE_795_length_49550_cov_5.… gb|QY… 1.28e-180 2862371 Pacman… Viruses polyp…
##  3 NODE_795_length_49550_cov_5.… gb|QJ… 3.69e- 98 1477405 Fausto… Viruses polyp…
##  4 NODE_795_length_49550_cov_5.… emb|S… 3.87e- 98 1477405 Fausto… Viruses Hypot…
##  5 NODE_795_length_49550_cov_5.… gb|QJ… 4.14e- 98 1477405 Fausto… Viruses hypot…
##  6 NODE_795_length_49550_cov_5.… gb|QJ… 4.86e- 98 1477405 Fausto… Viruses hypot…
##  7 NODE_795_length_49550_cov_5.… gb|QJ… 7.88e- 97 1477405 Fausto… Viruses polyp…
##  8 NODE_795_length_49550_cov_5.… gb|QJ… 8.18e- 97 1477405 Fausto… Viruses polyp…
##  9 NODE_795_length_49550_cov_5.… emb|S… 1.22e- 94 1973452 Fausto… Viruses 220 k…
## 10 NODE_795_length_49550_cov_5.… gb|AM… 4.27e- 93 1477405 Fausto… Viruses hypot…
## # … with 3,913 more rows, and abbreviated variable names ¹​sscinames,
## #   ²​sskingdoms
pinky_duplicates <- pinky_evalue %>% 
  group_by(qseqid) %>% 
  distinct(sseqid, .keep_all = TRUE)
print(pinky_duplicates)
## # A tibble: 3,689 × 7
## # Groups:   qseqid [132]
##    qseqid                        sseqid    evalue staxids sscin…¹ sskin…² stitle
##    <chr>                         <chr>      <dbl> <chr>   <chr>   <chr>   <chr> 
##  1 NODE_795_length_49550_cov_5.… ref|Y… 0         1932881 Pacman… Viruses Hypot…
##  2 NODE_795_length_49550_cov_5.… gb|QY… 1.28e-180 2862371 Pacman… Viruses polyp…
##  3 NODE_795_length_49550_cov_5.… gb|QJ… 3.69e- 98 1477405 Fausto… Viruses polyp…
##  4 NODE_795_length_49550_cov_5.… emb|S… 3.87e- 98 1477405 Fausto… Viruses Hypot…
##  5 NODE_795_length_49550_cov_5.… gb|QJ… 4.14e- 98 1477405 Fausto… Viruses hypot…
##  6 NODE_795_length_49550_cov_5.… gb|QJ… 4.86e- 98 1477405 Fausto… Viruses hypot…
##  7 NODE_795_length_49550_cov_5.… gb|QJ… 7.88e- 97 1477405 Fausto… Viruses polyp…
##  8 NODE_795_length_49550_cov_5.… gb|QJ… 8.18e- 97 1477405 Fausto… Viruses polyp…
##  9 NODE_795_length_49550_cov_5.… emb|S… 1.22e- 94 1973452 Fausto… Viruses 220 k…
## 10 NODE_795_length_49550_cov_5.… gb|AM… 4.27e- 93 1477405 Fausto… Viruses hypot…
## # … with 3,679 more rows, and abbreviated variable names ¹​sscinames,
## #   ²​sskingdoms

Renaming qseqids the product names produced on Prokka and hallmark proteins

library(dplyr)
rename_values <- c(
  "NODE_2150_length_28922_cov_4.964458_8" = "A32", 
"NODE_5244_length_16314_cov_5.483917_1" = "D5", 
"NODE_22952_length_5851_cov_5.322809_7" = "D5",
"NODE_795_length_49550_cov_5.527831_39" = "mRNAc",
"NODE_2253_length_28109_cov_5.502032_7" = "PolB",
"NODE_795_length_49550_cov_5.527831_38" = "RNAPL",
"NODE_1151_length_41258_cov_5.297770_55" = "RNAPS", 
"NODE_4708_length_17479_cov_5.389635_11" = "RNR",
"NODE_936_length_45786_cov_4.756839_10" = "SFII",
"NODE_2150_length_28922_cov_4.964458_11" = "VLTF3", 
"NODE_795_length_49550_cov_5.527831_2" = "Polyprotein pp62", 
"NODE_795_length_49550_cov_5.527831_20" = "DNA Ligase", 
"NODE_795_length_49550_cov_5.527831_49" = "mRNA-decapping protein g5R",
"NODE_936_length_45786_cov_4.756839_8" = "putative AP endonuclease",
"NODE_936_length_45786_cov_4.756839_17" = "Thymidylate synthase", 
"NODE_1151_length_41258_cov_5.297770_33" = "Deoxyuridine 5'-triphosphate nucleotidohydrolase"
)
# Replacing values in the 'qseqid' column in 'data_evalue' dataframe
pinky_duplicates$qseqid <- ifelse(pinky_duplicates$qseqid %in% names(rename_values), rename_values[pinky_duplicates$qseqid], pinky_duplicates$qseqid)
pinky_kingdom <- pinky_evalue %>% 
  count(sskingdoms)
pinky_kingdom
## # A tibble: 6 × 2
##   sskingdoms           n
##   <chr>            <int>
## 1 Archaea            161
## 2 Archaea;Bacteria     1
## 3 Bacteria           948
## 4 Eukaryota          639
## 5 N/A                  4
## 6 Viruses           2170
piechart_pinky <- ggplot(pinky_kingdom, aes(x="", y=n, fill=sskingdoms)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  scale_fill_brewer(palette = "PiYG") +
   theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid  = element_blank())
piechart_pinky

pinky_eukaryotes <- pinky_duplicates %>%
  filter(sskingdoms == "Eukaryota") %>%
  group_by(qseqid, sseqid, staxids) %>%
  slice_min(order_by = evalue) %>%
  ungroup() %>% 
  mutate(genus = word(sscinames, 1))
pinky_eukaryotes
## # A tibble: 618 × 8
##    qseqid                  sseqid    evalue staxids sscin…¹ sskin…² stitle genus
##    <chr>                   <chr>      <dbl> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 A32                     dbj|G… 1.64e- 49 1093978 Elysia… Eukary… BA71V… Elys…
##  2 D5                      dbj|G… 1.18e-165 1093978 Elysia… Eukary… BA71V… Elys…
##  3 DNA Ligase              dbj|G… 1.17e- 42 1093978 Elysia… Eukary… DNA l… Elys…
##  4 DNA Ligase              emb|C… 1.14e- 22 27381   Funnel… Eukary… 9786_… Funn…
##  5 DNA Ligase              gb|KA… 2.33e- 25 82926   Globis… Eukary… hypot… Glob…
##  6 DNA Ligase              gb|KA… 2.97e- 24 29920   Phytop… Eukary… hypot… Phyt…
##  7 DNA Ligase              gb|KA… 3.4 e- 31 164328  Phytop… Eukary… DNA l… Phyt…
##  8 DNA Ligase              gb|KA… 4.01e- 24 164328  Phytop… Eukary… DNA l… Phyt…
##  9 Deoxyuridine 5'-tripho… gb|EM… 2.98e- 24 885315  Entamo… Eukary… deoxy… Enta…
## 10 Deoxyuridine 5'-tripho… gb|KA… 2.28e- 25 117179  Tillet… Eukary… hypot… Till…
## # … with 608 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
pinky_egenus_counts <- pinky_eukaryotes %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(pinky_egenus_counts)
## # A tibble: 98 × 3
## # Groups:   qseqid, genus [98]
##    qseqid                                           genus         count
##    <chr>                                            <chr>         <int>
##  1 DNA Ligase                                       Phytophthora      3
##  2 Deoxyuridine 5'-triphosphate nucleotidohydrolase Tilletia          5
##  3 NODE_1151_length_41258_cov_5.297770_24           Macrostomum      12
##  4 NODE_1507_length_35821_cov_5.845160_24           Entrophospora     2
##  5 NODE_1507_length_35821_cov_5.845160_24           Rhizophagus      45
##  6 NODE_15262_length_7722_cov_5.326203_6            Perkinsus         3
##  7 NODE_2150_length_28922_cov_4.964458_23           Phytophthora      5
##  8 NODE_2253_length_28109_cov_5.502032_1            Elysia            2
##  9 NODE_2253_length_28109_cov_5.502032_10           Tetrabaena        2
## 10 NODE_2253_length_28109_cov_5.502032_3            Trametes          2
## # … with 88 more rows
distinct_colors <- c("skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue",  "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin")
pinkyvirus_eukaryotes_ggplot <- ggplot(pinky_egenus_counts, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = distinct_colors) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
pinkyvirus_eukaryotes_ggplot

ggplotly(pinkyvirus_eukaryotes_ggplot)
eukaryotes_pinkyggplot <- ggplot(pinky_egenus_counts, aes(x = genus, y = count)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  theme_minimal() +
  theme(text = element_text(size = 12), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
eukaryotes_pinkyggplot

Rhizophagus - Homology due to Horizontal Gene Transfer?

rhizophagus_pinky <- pinky_eukaryotes %>% 
  filter(genus == "Rhizophagus")
rhizophagus_pinky
## # A tibble: 92 × 8
##    qseqid                   sseqid   evalue staxids sscin…¹ sskin…² stitle genus
##    <chr>                    <chr>     <dbl> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 NODE_1507_length_35821_… dbj|G… 2.34e-53 747089  Rhizop… Eukary… poxvi… Rhiz…
##  2 NODE_1507_length_35821_… dbj|G… 1.73e-54 747089  Rhizop… Eukary… poxvi… Rhiz…
##  3 NODE_1507_length_35821_… dbj|G… 6.29e-52 747089  Rhizop… Eukary… poxvi… Rhiz…
##  4 NODE_1507_length_35821_… dbj|G… 1.48e-53 747089  Rhizop… Eukary… poxvi… Rhiz…
##  5 NODE_1507_length_35821_… dbj|G… 4.87e-51 747089  Rhizop… Eukary… poxvi… Rhiz…
##  6 NODE_1507_length_35821_… dbj|G… 2   e-50 747089  Rhizop… Eukary… poxvi… Rhiz…
##  7 NODE_1507_length_35821_… dbj|G… 8.67e-51 747089  Rhizop… Eukary… poxvi… Rhiz…
##  8 NODE_1507_length_35821_… dbj|G… 8.78e-51 747089  Rhizop… Eukary… poxvi… Rhiz…
##  9 NODE_1507_length_35821_… dbj|G… 7.45e-52 747089  Rhizop… Eukary… poxvi… Rhiz…
## 10 NODE_1507_length_35821_… dbj|G… 6.05e-52 747089  Rhizop… Eukary… poxvi… Rhiz…
## # … with 82 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
rhizophagus_pinky$sseqid
##  [1] "dbj|GBC12205.2|"     "dbj|GBC13564.1|"     "dbj|GBC21622.1|"    
##  [4] "dbj|GBC31224.2|"     "dbj|GBC35452.2|"     "dbj|GBC39224.2|"    
##  [7] "dbj|GBC47585.1|"     "dbj|GET50588.1|"     "dbj|GET50592.1|"    
## [10] "dbj|GET50593.1|"     "dbj|GET51853.1|"     "dbj|GET51855.1|"    
## [13] "dbj|GET51856.1|"     "dbj|GET51873.1|"     "dbj|GET51875.1|"    
## [16] "dbj|GET51876.1|"     "dbj|GET52808.1|"     "dbj|GET54086.1|"    
## [19] "dbj|GET54089.1|"     "dbj|GET54174.1|"     "dbj|GET56822.1|"    
## [22] "dbj|GET56825.1|"     "dbj|GET56826.1|"     "dbj|GET57267.1|"    
## [25] "dbj|GET57268.1|"     "dbj|GET57269.1|"     "dbj|GET57381.1|"    
## [28] "dbj|GET57383.1|"     "dbj|GET63053.1|"     "dbj|GET66458.1|"    
## [31] "dbj|GET66459.1|"     "dbj|GET67080.1|"     "dbj|GET67081.1|"    
## [34] "dbj|GET67083.1|"     "dbj|GET67084.1|"     "dbj|GET67086.1|"    
## [37] "dbj|GET67348.1|"     "dbj|GET67349.1|"     "emb|CAB4403144.1|"  
## [40] "emb|CAB4418995.1|"   "emb|CAB4443500.1|"   "emb|CAB4446634.1|"  
## [43] "emb|CAB4479734.1|"   "gb|EXX63233.1|"      "gb|PKY27577.1|"     
## [46] "dbj|GBC36622.2|"     "emb|CAB5185030.1|"   "gb|EXX73414.1|"     
## [49] "gb|PKC09441.1|"      "gb|PKY22961.1|"      "ref|XP_025173077.1|"
## [52] "emb|CAB5357217.1|"   "emb|CAG8693468.1|"   "emb|CAG8734086.1|"  
## [55] "gb|UZO10893.1|"      "ref|XP_025164841.1|" "dbj|GBB89267.1|"    
## [58] "dbj|GES93775.1|"     "dbj|GBB95954.1|"     "dbj|GBC22393.2|"    
## [61] "dbj|GES89815.1|"     "dbj|GES94728.1|"     "dbj|GES96473.1|"    
## [64] "dbj|GES96599.1|"     "emb|CAB4402555.1|"   "emb|CAB4403248.1|"  
## [67] "emb|CAB4474595.1|"   "emb|CAB4476463.1|"   "emb|CAB4480535.1|"  
## [70] "emb|CAB4492557.1|"   "emb|CAB5186435.1|"   "emb|CAB5205225.1|"  
## [73] "emb|CAB5208415.1|"   "emb|CAG8744073.1|"   "gb|PKB96922.1|"     
## [76] "gb|PKC08263.1|"      "gb|PKC54175.1|"      "gb|PKC57870.1|"     
## [79] "gb|PKK58991.1|"      "gb|PKK60440.1|"      "gb|PKK63436.1|"     
## [82] "gb|RGB22550.1|"      "ref|XP_025181784.1|" "emb|CAB5212864.1|"  
## [85] "emb|CAB5383446.1|"   "gb|EXX55117.1|"      "gb|UZO20410.1|"     
## [88] "ref|XP_025171547.1|" "emb|CAB4481493.1|"   "gb|PKC17413.1|"     
## [91] "gb|PKC65499.1|"      "gb|PKY14747.1|"
# Count the number of times each genus appears for a specific protein sequence
rhizophagus_counts_pinky <- rhizophagus_pinky %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(rhizophagus_counts_pinky)
## # A tibble: 8 × 3
## # Groups:   qseqid, genus [8]
##   qseqid                                 genus       count
##   <chr>                                  <chr>       <int>
## 1 NODE_1507_length_35821_cov_5.845160_24 Rhizophagus    45
## 2 NODE_4293_length_18587_cov_5.277736_15 Rhizophagus     6
## 3 NODE_4708_length_17479_cov_5.389635_18 Rhizophagus     5
## 4 NODE_936_length_45786_cov_4.756839_31  Rhizophagus     2
## 5 PolB                                   Rhizophagus    25
## 6 Polyprotein pp62                       Rhizophagus     2
## 7 mRNA-decapping protein g5R             Rhizophagus     3
## 8 mRNAc                                  Rhizophagus     4
dc <- c("#DE77AE")
rhizophagus_ggplot <- ggplot(rhizophagus_counts_pinky, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = dc) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
rhizophagus_ggplot

library(rentrez)
# Function to retrieve BioSample accession for a given protein accession
get_biosample_accession <- function(rhizophagusPinky_accessions) {
  entrez_query <- paste0(rhizophagusPinky_accessions, "[ACCN]")
  entrez_result <- entrez_fetch(db = "protein", id = entrez_query, rettype = "gb", retmode = "text")
  biosample_accession <- regmatches(entrez_result, regexpr("BioSample: (\\w+)", entrez_result))
  return(biosample_accession)
}
# Example protein accession numbers
rhizophagusPinky_accessions <- c(
    "dbj|GBC12205.2|", "dbj|GBC13564.1|", "dbj|GBC21622.1|", "dbj|GBC31224.2|", "dbj|GBC35452.2|",
    "dbj|GBC39224.2|", "dbj|GBC47585.1|", "dbj|GET50588.1|", "dbj|GET50592.1|", "dbj|GET50593.1|",
    "dbj|GET51853.1|", "dbj|GET51855.1|", "dbj|GET51856.1|", "dbj|GET51873.1|", "dbj|GET51875.1|",
    "dbj|GET51876.1|", "dbj|GET52808.1|", "dbj|GET54086.1|", "dbj|GET54089.1|", "dbj|GET54174.1|",
    "dbj|GET56822.1|", "dbj|GET56825.1|", "dbj|GET56826.1|", "dbj|GET57267.1|", "dbj|GET57268.1|",
    "dbj|GET57269.1|", "dbj|GET57381.1|", "dbj|GET57383.1|", "dbj|GET63053.1|", "dbj|GET66458.1|",
    "dbj|GET66459.1|", "dbj|GET67080.1|", "dbj|GET67081.1|", "dbj|GET67083.1|", "dbj|GET67084.1|",
    "dbj|GET67086.1|", "dbj|GET67348.1|", "dbj|GET67349.1|", "emb|CAB4403144.1|", "emb|CAB4418995.1|",
    "emb|CAB4443500.1|", "emb|CAB4446634.1|", "emb|CAB4479734.1|", "gb|EXX63233.1|", "gb|PKY27577.1|",
    "dbj|GBC36622.2|", "emb|CAB5185030.1|", "gb|EXX73414.1|", "gb|PKC09441.1|", "gb|PKY22961.1|",
    "ref|XP_025173077.1|", "emb|CAB5357217.1|", "emb|CAG8693468.1|", "emb|CAG8734086.1|", "gb|UZO10893.1|",
    "ref|XP_025164841.1|", "dbj|GBB89267.1|", "dbj|GES93775.1|", "dbj|GBB95954.1|", "dbj|GBC22393.2|",
    "dbj|GES89815.1|", "dbj|GES94728.1|", "dbj|GES96473.1|", "dbj|GES96599.1|", "emb|CAB4402555.1|",
    "emb|CAB4403248.1|", "emb|CAB4474595.1|", "emb|CAB4476463.1|", "emb|CAB4480535.1|", "emb|CAB4492557.1|",
    "emb|CAB5186435.1|", "emb|CAB5205225.1|", "emb|CAB5208415.1|", "emb|CAG8744073.1|", "gb|PKB96922.1|",
    "gb|PKC08263.1|", "gb|PKC54175.1|", "gb|PKC57870.1|", "gb|PKK58991.1|", "gb|PKK60440.1|", "gb|PKK63436.1|",
    "gb|RGB22550.1|", "ref|XP_025181784.1|", "emb|CAB5212864.1|", "emb|CAB5383446.1|", "gb|EXX55117.1|",
    "gb|UZO20410.1|", "ref|XP_025171547.1|", "emb|CAB4481493.1|", "gb|PKC17413.1|", "gb|PKC65499.1|", "gb|PKY14747.1|"
)


# Retrieve the BioSample accession for the given protein accessions
biosample_accessions <- sapply(rhizophagusPinky_accessions, get_biosample_accession)

# Print the result
print(biosample_accessions)
##           dbj|GBC12205.2|           dbj|GBC13564.1|           dbj|GBC21622.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GBC31224.2|           dbj|GBC35452.2|           dbj|GBC39224.2| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GBC47585.1|           dbj|GET50588.1|           dbj|GET50592.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET50593.1|           dbj|GET51853.1|           dbj|GET51855.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET51856.1|           dbj|GET51873.1|           dbj|GET51875.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET51876.1|           dbj|GET52808.1|           dbj|GET54086.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET54089.1|           dbj|GET54174.1|           dbj|GET56822.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET56825.1|           dbj|GET56826.1|           dbj|GET57267.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET57268.1|           dbj|GET57269.1|           dbj|GET57381.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET57383.1|           dbj|GET63053.1|           dbj|GET66458.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET66459.1|           dbj|GET67080.1|           dbj|GET67081.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET67083.1|           dbj|GET67084.1|           dbj|GET67086.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMD00053945" 
##           dbj|GET67348.1|           dbj|GET67349.1|         emb|CAB4403144.1| 
## "BioSample: SAMD00053945" "BioSample: SAMD00053945" "BioSample: SAMEA5841291" 
##         emb|CAB4418995.1|         emb|CAB4443500.1|         emb|CAB4446634.1| 
## "BioSample: SAMEA5841291" "BioSample: SAMEA5841291" "BioSample: SAMEA5841291" 
##         emb|CAB4479734.1|            gb|EXX63233.1|            gb|PKY27577.1| 
## "BioSample: SAMEA5841244" "BioSample: SAMN02422737" "BioSample: SAMN04195446" 
##           dbj|GBC36622.2|         emb|CAB5185030.1|            gb|EXX73414.1| 
## "BioSample: SAMD00053945" "BioSample: SAMEA5841246" "BioSample: SAMN02422737" 
##            gb|PKC09441.1|            gb|PKY22961.1|       ref|XP_025173077.1| 
## "BioSample: SAMN04195397" "BioSample: SAMN04195446" "BioSample: SAMN02744054" 
##         emb|CAB5357217.1|         emb|CAG8693468.1|         emb|CAG8734086.1| 
## "BioSample: SAMEA5841255" "BioSample: SAMEA8911290" "BioSample: SAMEA8911290" 
##            gb|UZO10893.1|       ref|XP_025164841.1|           dbj|GBB89267.1| 
## "BioSample: SAMN31081226" "BioSample: SAMN02744054" "BioSample: SAMD00098044" 
##           dbj|GES93775.1|           dbj|GBB95954.1|           dbj|GBC22393.2| 
## "BioSample: SAMD00190756" "BioSample: SAMD00098044" "BioSample: SAMD00053945" 
##           dbj|GES89815.1|           dbj|GES94728.1|           dbj|GES96473.1| 
## "BioSample: SAMD00190756" "BioSample: SAMD00190756" "BioSample: SAMD00190756" 
##           dbj|GES96599.1|         emb|CAB4402555.1|         emb|CAB4403248.1| 
## "BioSample: SAMD00190756" "BioSample: SAMEA5841291" "BioSample: SAMEA5841291" 
##         emb|CAB4474595.1|         emb|CAB4476463.1|         emb|CAB4480535.1| 
## "BioSample: SAMEA5841244" "BioSample: SAMEA5841244" "BioSample: SAMEA5841244" 
##         emb|CAB4492557.1|         emb|CAB5186435.1|         emb|CAB5205225.1| 
## "BioSample: SAMEA5841244" "BioSample: SAMEA5841246" "BioSample: SAMEA5841246" 
##         emb|CAB5208415.1|         emb|CAG8744073.1|            gb|PKB96922.1| 
## "BioSample: SAMEA5841246" "BioSample: SAMEA8911290" "BioSample: SAMN04195397" 
##            gb|PKC08263.1|            gb|PKC54175.1|            gb|PKC57870.1| 
## "BioSample: SAMN04195397" "BioSample: SAMN04195188" "BioSample: SAMN04195188" 
##            gb|PKK58991.1|            gb|PKK60440.1|            gb|PKK63436.1| 
## "BioSample: SAMN04195450" "BioSample: SAMN04195450" "BioSample: SAMN04195450" 
##            gb|RGB22550.1|       ref|XP_025181784.1|         emb|CAB5212864.1| 
## "BioSample: SAMN08364867" "BioSample: SAMN02744054" "BioSample: SAMEA5841246" 
##         emb|CAB5383446.1|            gb|EXX55117.1|            gb|UZO20410.1| 
## "BioSample: SAMEA5841255" "BioSample: SAMN02422737" "BioSample: SAMN31081226" 
##       ref|XP_025171547.1|         emb|CAB4481493.1|            gb|PKC17413.1| 
## "BioSample: SAMN02744054" "BioSample: SAMEA5841244" "BioSample: SAMN04195397" 
##            gb|PKC65499.1|            gb|PKY14747.1| 
## "BioSample: SAMN04195188" "BioSample: SAMN04195446"
# Retrieve the BioSample accessions for the sseqid column
rhizophagus_pinky$biosample_accession <- sapply(rhizophagus_pinky$sseqid, get_biosample_accession)

# Print the updated data
print(rhizophagus_pinky)
## # A tibble: 92 × 9
##    qseqid           sseqid   evalue staxids sscin…¹ sskin…² stitle genus biosa…³
##    <chr>            <chr>     <dbl> <chr>   <chr>   <chr>   <chr>  <chr> <chr>  
##  1 NODE_1507_lengt… dbj|G… 2.34e-53 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  2 NODE_1507_lengt… dbj|G… 1.73e-54 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  3 NODE_1507_lengt… dbj|G… 6.29e-52 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  4 NODE_1507_lengt… dbj|G… 1.48e-53 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  5 NODE_1507_lengt… dbj|G… 4.87e-51 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  6 NODE_1507_lengt… dbj|G… 2   e-50 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  7 NODE_1507_lengt… dbj|G… 8.67e-51 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  8 NODE_1507_lengt… dbj|G… 8.78e-51 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  9 NODE_1507_lengt… dbj|G… 7.45e-52 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
## 10 NODE_1507_lengt… dbj|G… 6.05e-52 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
## # … with 82 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## #   ³​biosample_accession
# Removing repeated Biosample IDs. 
rhizophagus_biosample <- rhizophagus_pinky %>% 
  group_by(qseqid) %>% 
  distinct(biosample_accession, .keep_all = TRUE)
rhizophagus_biosample
## # A tibble: 38 × 9
## # Groups:   qseqid [8]
##    qseqid           sseqid   evalue staxids sscin…¹ sskin…² stitle genus biosa…³
##    <chr>            <chr>     <dbl> <chr>   <chr>   <chr>   <chr>  <chr> <chr>  
##  1 NODE_1507_lengt… dbj|G… 2.34e-53 747089  Rhizop… Eukary… poxvi… Rhiz… BioSam…
##  2 NODE_1507_lengt… emb|C… 4.43e-54 588596  Rhizop… Eukary… unnam… Rhiz… BioSam…
##  3 NODE_1507_lengt… emb|C… 1.44e-54 588596  Rhizop… Eukary… unnam… Rhiz… BioSam…
##  4 NODE_1507_lengt… gb|EX… 1.8 e-50 747089… Rhizop… Eukary… hypot… Rhiz… BioSam…
##  5 NODE_1507_lengt… gb|PK… 1.19e-49 588596  Rhizop… Eukary… hypot… Rhiz… BioSam…
##  6 NODE_4293_lengt… dbj|G… 2.31e-22 747089  Rhizop… Eukary… hypot… Rhiz… BioSam…
##  7 NODE_4293_lengt… emb|C… 4.38e-38 588596  Rhizop… Eukary… unnam… Rhiz… BioSam…
##  8 NODE_4293_lengt… gb|EX… 7.89e-30 1432141 Rhizop… Eukary… hypot… Rhiz… BioSam…
##  9 NODE_4293_lengt… gb|PK… 7.6 e-38 588596  Rhizop… Eukary… hypot… Rhiz… BioSam…
## 10 NODE_4293_lengt… gb|PK… 1.12e-37 588596  Rhizop… Eukary… hypot… Rhiz… BioSam…
## # … with 28 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## #   ³​biosample_accession
# Count the number of times each genus appears for a specific protein sequence
rhizophagus_biosample_count <- rhizophagus_biosample %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(rhizophagus_biosample_count)
## # A tibble: 8 × 3
## # Groups:   qseqid, genus [8]
##   qseqid                                 genus       count
##   <chr>                                  <chr>       <int>
## 1 NODE_1507_length_35821_cov_5.845160_24 Rhizophagus     5
## 2 NODE_4293_length_18587_cov_5.277736_15 Rhizophagus     6
## 3 NODE_4708_length_17479_cov_5.389635_18 Rhizophagus     4
## 4 NODE_936_length_45786_cov_4.756839_31  Rhizophagus     2
## 5 PolB                                   Rhizophagus    12
## 6 Polyprotein pp62                       Rhizophagus     2
## 7 mRNA-decapping protein g5R             Rhizophagus     3
## 8 mRNAc                                  Rhizophagus     4
rhizophagus_biosample_ggplot <- ggplot(rhizophagus_biosample_count, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = dc) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
rhizophagus_biosample_ggplot

Phytophthora - Homology due to Horizontal Gene Transfer?

phytophthora_pinky <- pinky_eukaryotes %>% 
  filter(genus == "Phytophthora")
phytophthora_pinky
## # A tibble: 57 × 8
##    qseqid                  sseqid    evalue staxids sscin…¹ sskin…² stitle genus
##    <chr>                   <chr>      <dbl> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 DNA Ligase              gb|KA… 2.97e- 24 29920   Phytop… Eukary… hypot… Phyt…
##  2 DNA Ligase              gb|KA… 3.4 e- 31 164328  Phytop… Eukary… DNA l… Phyt…
##  3 DNA Ligase              gb|KA… 4.01e- 24 164328  Phytop… Eukary… DNA l… Phyt…
##  4 NODE_10695_length_9982… gb|KA… 1.5 e- 11 109152  Phytop… Eukary… hypot… Phyt…
##  5 NODE_2150_length_28922… gb|ET… 4.73e-116 4792;1… Phytop… Eukary… hypot… Phyt…
##  6 NODE_2150_length_28922… gb|ET… 1.12e-111 1317066 Phytop… Eukary… hypot… Phyt…
##  7 NODE_2150_length_28922… gb|ET… 2.82e-113 1317064 Phytop… Eukary… hypot… Phyt…
##  8 NODE_2150_length_28922… gb|KU… 5.03e-115 4790    Phytop… Eukary… Major… Phyt…
##  9 NODE_2150_length_28922… ref|X… 2.31e-114 761204  Phytop… Eukary… hypot… Phyt…
## 10 NODE_4293_length_18587… gb|KA… 1.38e- 27 221518  Phytop… Eukary… hypot… Phyt…
## # … with 47 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
phytophthora_pinkycounts <- phytophthora_pinky %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(phytophthora_pinkycounts)
## # A tibble: 10 × 3
## # Groups:   qseqid, genus [10]
##    qseqid                                 genus        count
##    <chr>                                  <chr>        <int>
##  1 DNA Ligase                             Phytophthora     3
##  2 NODE_2150_length_28922_cov_4.964458_23 Phytophthora     5
##  3 NODE_795_length_49550_cov_5.527831_47  Phytophthora     2
##  4 NODE_936_length_45786_cov_4.756839_11  Phytophthora     5
##  5 NODE_936_length_45786_cov_4.756839_16  Phytophthora    26
##  6 NODE_936_length_45786_cov_4.756839_28  Phytophthora     2
##  7 NODE_936_length_45786_cov_4.756839_44  Phytophthora     3
##  8 RNAPL                                  Phytophthora     3
##  9 RNAPS                                  Phytophthora     2
## 10 VLTF3                                  Phytophthora     2
pc <- c("#7FBC41")
phytophthora_pinky_ggplot <- ggplot(phytophthora_pinkycounts, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = pc) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
phytophthora_pinky_ggplot

phytophthora_pinky$sseqid
##  [1] "gb|KAG6947535.1|"    "gb|KAH7468958.1|"    "gb|KAH7499174.1|"   
##  [4] "gb|KAG7394170.1|"    "gb|ETI39689.1|"      "gb|ETO68404.1|"     
##  [7] "gb|ETP37615.1|"      "gb|KUG01557.1|"      "ref|XP_008910449.1|"
## [10] "gb|KAG7375702.1|"    "gb|KAF1783475.1|"    "gb|ETL77478.1|"     
## [13] "gb|ETP35848.1|"      "gb|KAG1705391.1|"    "gb|KAG7388034.1|"   
## [16] "gb|KAG7391651.1|"    "gb|KAH7470505.1|"    "gb|OWZ13752.1|"     
## [19] "dbj|GMF32380.1|"     "gb|ETI47070.1|"      "gb|ETK86998.1|"     
## [22] "gb|ETL40412.1|"      "gb|ETL93566.1|"      "gb|KAE8887753.1|"   
## [25] "gb|KAE8946961.1|"    "gb|KAE8988664.1|"    "gb|KAE8991089.1|"   
## [28] "gb|KAF1781219.1|"    "gb|KAF4045723.1|"    "gb|KAG1686864.1|"   
## [31] "gb|KAG1692341.1|"    "gb|KAG2776868.1|"    "gb|KAG3023315.1|"   
## [34] "gb|KAG3114356.1|"    "gb|KAG6604511.1|"    "gb|KAG6972529.1|"   
## [37] "gb|KAG7391747.1|"    "gb|KAG7400365.1|"    "gb|KUF97494.1|"     
## [40] "gb|OWZ19253.1|"      "gb|POM60565.1|"      "ref|XP_002904741.1|"
## [43] "ref|XP_008905518.1|" "ref|XP_009520697.1|" "gb|KAG2967876.1|"   
## [46] "gb|KAG3088486.1|"    "gb|KAG6961386.1|"    "gb|ETL86515.1|"     
## [49] "gb|ETP37612.1|"      "ref|XP_008910456.1|" "gb|ETL86504.1|"     
## [52] "gb|KAF1785852.1|"    "gb|KAG2759394.1|"    "gb|ETO68411.1|"     
## [55] "gb|ETP37619.1|"      "gb|KAG2980116.1|"    "gb|KAG3045077.1|"
library(rentrez)

# Function to retrieve BioSample accession for a given protein accession
get_biosample_accession_phytophthora <- function(phytophthoraPinky_accessions) {
  if (startsWith(phytophthoraPinky_accessions, "ref")) {
    return(phytophthoraPinky_accessions)  # Return the protein accession directly
  } else {
    entrez_query <- paste0(phytophthoraPinky_accessions, "[ACCN]")
    entrez_result <- entrez_fetch(db = "protein", id = entrez_query, rettype = "gb", retmode = "text")
    biosample_accession_phytophthora <- regmatches(entrez_result, regexpr("BioSample: (\\w+)", entrez_result))
    return(biosample_accession_phytophthora)
}}
# Example protein accession numbers
phytophthoraPinky_accessions <- c(
   "gb|KAG6947535.1|", "gb|KAH7468958.1|", "gb|KAH7499174.1|", "gb|KAG7394170.1|", "gb|ETI39689.1|",     
"gb|ETO68404.1|", "gb|ETP37615.1|", "gb|KUG01557.1|", "ref|XP_008910449.1|", "gb|KAG7375702.1|", "gb|KAF1783475.1|", "gb|ETL77478.1|", "gb|ETP35848.1|", "gb|KAG1705391.1|", "gb|KAG7388034.1|", "gb|KAG7391651.1|", "gb|KAH7470505.1|", "gb|OWZ13752.1|", "dbj|GMF32380.1|", "gb|ETI47070.1|", "gb|ETK86998.1|", "gb|ETL40412.1|", "gb|ETL93566.1|", "gb|KAE8887753.1|", "gb|KAE8946961.1|", "gb|KAE8988664.1|", "gb|KAE8991089.1|", "gb|KAF1781219.1|", "gb|KAF4045723.1|", "gb|KAG1686864.1|", "gb|KAG1692341.1|", "gb|KAG2776868.1|", "gb|KAG3023315.1|", "gb|KAG3114356.1|", "gb|KAG6604511.1|",   
"gb|KAG6972529.1|", "gb|KAG7391747.1|", "gb|KAG7400365.1|", "gb|KUF97494.1|", "gb|OWZ19253.1|", "gb|POM60565.1|",      "ref|XP_002904741.1|", "ref|XP_008905518.1|", "ref|XP_009520697.1|", "gb|KAG2967876.1|", "gb|KAG3088486.1|", "gb|KAG6961386.1|",    "gb|ETL86515.1|", "gb|ETP37612.1|", "ref|XP_008910456.1|", "gb|ETL86504.1|", "gb|KAF1785852.1|", "gb|KAG2759394.1|", "gb|ETO68411.1|",   "gb|ETP37619.1|", "gb|KAG2980116.1|", "gb|KAG3045077.1|"   
)

# Retrieve the BioSample accession for the given protein accessions
biosample_accessions_phytophthora <- sapply(phytophthoraPinky_accessions, get_biosample_accession_phytophthora)

# Print the result
print(biosample_accessions_phytophthora)
##          gb|KAG6947535.1|          gb|KAH7468958.1|          gb|KAH7499174.1| 
## "BioSample: SAMN17151089" "BioSample: SAMN19730089" "BioSample: SAMN19730093" 
##          gb|KAG7394170.1|            gb|ETI39689.1|            gb|ETO68404.1| 
## "BioSample: SAMN17922340" "BioSample: SAMN01816558" "BioSample: SAMN01816559" 
##            gb|ETP37615.1|            gb|KUG01557.1|       ref|XP_008910449.1| 
## "BioSample: SAMN01816557" "BioSample: SAMN04017959"     "ref|XP_008910449.1|" 
##          gb|KAG7375702.1|          gb|KAF1783475.1|            gb|ETL77478.1| 
## "BioSample: SAMN17922341" "BioSample: SAMN04633454" "BioSample: SAMN02178349" 
##            gb|ETP35848.1|          gb|KAG1705391.1|          gb|KAG7388034.1| 
## "BioSample: SAMN01816557" "BioSample: SAMN09692924" "BioSample: SAMN17922340" 
##          gb|KAG7391651.1|          gb|KAH7470505.1|            gb|OWZ13752.1| 
## "BioSample: SAMN17922341" "BioSample: SAMN19730089" "BioSample: SAMN04632436" 
##           dbj|GMF32380.1|            gb|ETI47070.1|            gb|ETK86998.1| 
## "BioSample: SAMD00557650" "BioSample: SAMN01816558" "BioSample: SAMN02178347" 
##            gb|ETL40412.1|            gb|ETL93566.1|          gb|KAE8887753.1| 
## "BioSample: SAMN02178348" "BioSample: SAMN02178349" "BioSample: SAMN07449681" 
##          gb|KAE8946961.1|          gb|KAE8988664.1|          gb|KAE8991089.1| 
## "BioSample: SAMN07449687" "BioSample: SAMN07449691" "BioSample: SAMN07449690" 
##          gb|KAF1781219.1|          gb|KAF4045723.1|          gb|KAG1686864.1| 
## "BioSample: SAMN04633454" "BioSample: SAMN13441166" "BioSample: SAMN09692924" 
##          gb|KAG1692341.1|          gb|KAG2776868.1|          gb|KAG3023315.1| 
## "BioSample: SAMN09692924" "BioSample: SAMN06766401" "BioSample: SAMN07267863" 
##          gb|KAG3114356.1|          gb|KAG6604511.1|          gb|KAG6972529.1| 
## "BioSample: SAMN07267868" "BioSample: SAMN16706909" "BioSample: SAMN17151088" 
##          gb|KAG7391747.1|          gb|KAG7400365.1|            gb|KUF97494.1| 
## "BioSample: SAMN17922341" "BioSample: SAMN17922340" "BioSample: SAMN04017960" 
##            gb|OWZ19253.1|            gb|POM60565.1|       ref|XP_002904741.1| 
## "BioSample: SAMN04632436" "BioSample: SAMN04632378"     "ref|XP_002904741.1|" 
##       ref|XP_008905518.1|       ref|XP_009520697.1|          gb|KAG2967876.1| 
##     "ref|XP_008905518.1|"     "ref|XP_009520697.1|" "BioSample: SAMN07267863" 
##          gb|KAG3088486.1|          gb|KAG6961386.1|            gb|ETL86515.1| 
## "BioSample: SAMN07267868" "BioSample: SAMN17151088" "BioSample: SAMN02178349" 
##            gb|ETP37612.1|       ref|XP_008910456.1|            gb|ETL86504.1| 
## "BioSample: SAMN01816557"     "ref|XP_008910456.1|" "BioSample: SAMN02178349" 
##          gb|KAF1785852.1|          gb|KAG2759394.1|            gb|ETO68411.1| 
## "BioSample: SAMN04633454" "BioSample: SAMN06766401" "BioSample: SAMN01816559" 
##            gb|ETP37619.1|          gb|KAG2980116.1|          gb|KAG3045077.1| 
## "BioSample: SAMN01816557" "BioSample: SAMN07267863" "BioSample: SAMN07267864"
# Retrieve the BioSample accessions for the sseqid column
phytophthora_pinky$biosample_accession_phytophthora <- sapply(phytophthora_pinky$sseqid, get_biosample_accession_phytophthora)

# Print the updated data
print(phytophthora_pinky)
## # A tibble: 57 × 9
##    qseqid          sseqid    evalue staxids sscin…¹ sskin…² stitle genus biosa…³
##    <chr>           <chr>      <dbl> <chr>   <chr>   <chr>   <chr>  <chr> <chr>  
##  1 DNA Ligase      gb|KA… 2.97e- 24 29920   Phytop… Eukary… hypot… Phyt… BioSam…
##  2 DNA Ligase      gb|KA… 3.4 e- 31 164328  Phytop… Eukary… DNA l… Phyt… BioSam…
##  3 DNA Ligase      gb|KA… 4.01e- 24 164328  Phytop… Eukary… DNA l… Phyt… BioSam…
##  4 NODE_10695_len… gb|KA… 1.5 e- 11 109152  Phytop… Eukary… hypot… Phyt… BioSam…
##  5 NODE_2150_leng… gb|ET… 4.73e-116 4792;1… Phytop… Eukary… hypot… Phyt… BioSam…
##  6 NODE_2150_leng… gb|ET… 1.12e-111 1317066 Phytop… Eukary… hypot… Phyt… BioSam…
##  7 NODE_2150_leng… gb|ET… 2.82e-113 1317064 Phytop… Eukary… hypot… Phyt… BioSam…
##  8 NODE_2150_leng… gb|KU… 5.03e-115 4790    Phytop… Eukary… Major… Phyt… BioSam…
##  9 NODE_2150_leng… ref|X… 2.31e-114 761204  Phytop… Eukary… hypot… Phyt… ref|XP…
## 10 NODE_4293_leng… gb|KA… 1.38e- 27 221518  Phytop… Eukary… hypot… Phyt… BioSam…
## # … with 47 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## #   ³​biosample_accession_phytophthora
# Removing repeated Biosample IDs. 
phytophthora_pinky_biosample <- phytophthora_pinky %>% 
  group_by(qseqid) %>% 
  distinct(biosample_accession_phytophthora, .keep_all = TRUE)
phytophthora_pinky_biosample
## # A tibble: 56 × 9
## # Groups:   qseqid [14]
##    qseqid          sseqid    evalue staxids sscin…¹ sskin…² stitle genus biosa…³
##    <chr>           <chr>      <dbl> <chr>   <chr>   <chr>   <chr>  <chr> <chr>  
##  1 DNA Ligase      gb|KA… 2.97e- 24 29920   Phytop… Eukary… hypot… Phyt… BioSam…
##  2 DNA Ligase      gb|KA… 3.4 e- 31 164328  Phytop… Eukary… DNA l… Phyt… BioSam…
##  3 DNA Ligase      gb|KA… 4.01e- 24 164328  Phytop… Eukary… DNA l… Phyt… BioSam…
##  4 NODE_10695_len… gb|KA… 1.5 e- 11 109152  Phytop… Eukary… hypot… Phyt… BioSam…
##  5 NODE_2150_leng… gb|ET… 4.73e-116 4792;1… Phytop… Eukary… hypot… Phyt… BioSam…
##  6 NODE_2150_leng… gb|ET… 1.12e-111 1317066 Phytop… Eukary… hypot… Phyt… BioSam…
##  7 NODE_2150_leng… gb|ET… 2.82e-113 1317064 Phytop… Eukary… hypot… Phyt… BioSam…
##  8 NODE_2150_leng… gb|KU… 5.03e-115 4790    Phytop… Eukary… Major… Phyt… BioSam…
##  9 NODE_2150_leng… ref|X… 2.31e-114 761204  Phytop… Eukary… hypot… Phyt… ref|XP…
## 10 NODE_4293_leng… gb|KA… 1.38e- 27 221518  Phytop… Eukary… hypot… Phyt… BioSam…
## # … with 46 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms,
## #   ³​biosample_accession_phytophthora
# List of protein accession numbers
phytophthora_accessions <- c("gb|KAG6947535.1|", "gb|KAH7468958.1|", "gb|KAH7499174.1|", "gb|KAG7394170.1|", "gb|ETI39689.1|",     
"gb|ETO68404.1|", "gb|ETP37615.1|", "gb|KUG01557.1|", "ref|XP_008910449.1|", "gb|KAG7375702.1|", "gb|KAF1783475.1|", "gb|ETL77478.1|", "gb|ETP35848.1|", "gb|KAG1705391.1|", "gb|KAG7388034.1|", "gb|KAG7391651.1|", "gb|KAH7470505.1|", "gb|OWZ13752.1|", "dbj|GMF32380.1|", "gb|ETI47070.1|", "gb|ETK86998.1|", "gb|ETL40412.1|", "gb|ETL93566.1|", "gb|KAE8887753.1|", "gb|KAE8946961.1|", "gb|KAE8988664.1|", "gb|KAE8991089.1|", "gb|KAF1781219.1|", "gb|KAF4045723.1|", "gb|KAG1686864.1|", "gb|KAG1692341.1|", "gb|KAG2776868.1|", "gb|KAG3023315.1|", "gb|KAG3114356.1|", "gb|KAG6604511.1|",   
"gb|KAG6972529.1|", "gb|KAG7391747.1|", "gb|KAG7400365.1|", "gb|KUF97494.1|", "gb|OWZ19253.1|", "gb|POM60565.1|",      "ref|XP_002904741.1|", "ref|XP_008905518.1|", "ref|XP_009520697.1|", "gb|KAG2967876.1|", "gb|KAG3088486.1|", "gb|KAG6961386.1|",    "gb|ETL86515.1|", "gb|ETP37612.1|", "ref|XP_008910456.1|", "gb|ETL86504.1|", "gb|KAF1785852.1|", "gb|KAG2759394.1|", "gb|ETO68411.1|",   "gb|ETP37619.1|", "gb|KAG2980116.1|", "gb|KAG3045077.1|")

# Retrieve protein sequences using NCBI Entrez

# Write protein sequences to a FASTA file
writeLines(phytophthora_accessions, "protein_sequences.faa")
library(rentrez)

# List of protein accession numbers
protein_accessions <- c("gb|KAG6947535.1|", "gb|KAH7468958.1|", "gb|KAH7499174.1|", "gb|KAG7394170.1|", "gb|ETI39689.1|",     
"gb|ETO68404.1|", "gb|ETP37615.1|", "gb|KUG01557.1|", "ref|XP_008910449.1|", "gb|KAG7375702.1|", "gb|KAF1783475.1|", "gb|ETL77478.1|", "gb|ETP35848.1|", "gb|KAG1705391.1|", "gb|KAG7388034.1|", "gb|KAG7391651.1|", "gb|KAH7470505.1|", "gb|OWZ13752.1|", "dbj|GMF32380.1|", "gb|ETI47070.1|", "gb|ETK86998.1|", "gb|ETL40412.1|", "gb|ETL93566.1|", "gb|KAE8887753.1|", "gb|KAE8946961.1|", "gb|KAE8988664.1|", "gb|KAE8991089.1|", "gb|KAF1781219.1|", "gb|KAF4045723.1|", "gb|KAG1686864.1|", "gb|KAG1692341.1|", "gb|KAG2776868.1|", "gb|KAG3023315.1|", "gb|KAG3114356.1|", "gb|KAG6604511.1|",   
"gb|KAG6972529.1|", "gb|KAG7391747.1|", "gb|KAG7400365.1|", "gb|KUF97494.1|", "gb|OWZ19253.1|", "gb|POM60565.1|",      "ref|XP_002904741.1|", "ref|XP_008905518.1|", "ref|XP_009520697.1|", "gb|KAG2967876.1|", "gb|KAG3088486.1|", "gb|KAG6961386.1|",    "gb|ETL86515.1|", "gb|ETP37612.1|", "ref|XP_008910456.1|", "gb|ETL86504.1|", "gb|KAF1785852.1|", "gb|KAG2759394.1|", "gb|ETO68411.1|",   "gb|ETP37619.1|", "gb|KAG2980116.1|", "gb|KAG3045077.1|")

# Retrieve protein sequences using NCBI Entrez
protein_sequences <- entrez_fetch(
  db = "protein",
  id = protein_accessions,
  rettype = "fasta_cds_aa",  # Use "fasta_cds_aa" to get protein sequences
  retmode = "text"
)

# Write protein sequences to a .faa file
output_file <- "protein_sequences.faa"
writeLines(protein_sequences, output_file)

# Print a message indicating successful writing
cat("Protein sequences have been written to", output_file, "\n")
## Protein sequences have been written to protein_sequences.faa
# Count the number of times each genus appears for a specific protein sequence
phytophthora_biosample_count <- phytophthora_pinky_biosample %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep")  
print(phytophthora_biosample_count)
## # A tibble: 14 × 3
## # Groups:   qseqid, genus [14]
##    qseqid                                 genus        count
##    <chr>                                  <chr>        <int>
##  1 DNA Ligase                             Phytophthora     3
##  2 NODE_10695_length_9982_cov_5.336658_8  Phytophthora     1
##  3 NODE_2150_length_28922_cov_4.964458_23 Phytophthora     5
##  4 NODE_4293_length_18587_cov_5.277736_18 Phytophthora     1
##  5 NODE_5244_length_16314_cov_5.483917_3  Phytophthora     1
##  6 NODE_795_length_49550_cov_5.527831_47  Phytophthora     2
##  7 NODE_936_length_45786_cov_4.756839_11  Phytophthora     5
##  8 NODE_936_length_45786_cov_4.756839_16  Phytophthora    25
##  9 NODE_936_length_45786_cov_4.756839_28  Phytophthora     2
## 10 NODE_936_length_45786_cov_4.756839_36  Phytophthora     1
## 11 NODE_936_length_45786_cov_4.756839_44  Phytophthora     3
## 12 RNAPL                                  Phytophthora     3
## 13 RNAPS                                  Phytophthora     2
## 14 VLTF3                                  Phytophthora     2
phytophthora_biosample_ggplot <- ggplot(phytophthora_biosample_count, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = pc) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
phytophthora_biosample_ggplot

# Count the number of times each genus appears for a specific protein sequence
phytophthora_biosample_repeats <- phytophthora_pinky_biosample %>%
  group_by(biosample_accession_phytophthora) %>%
  summarise(count = n(), .groups = "keep")  
print(phytophthora_biosample_repeats)
## # A tibble: 34 × 2
## # Groups:   biosample_accession_phytophthora [34]
##    biosample_accession_phytophthora count
##    <chr>                            <int>
##  1 BioSample: SAMD00557650              1
##  2 BioSample: SAMN01816557              4
##  3 BioSample: SAMN01816558              2
##  4 BioSample: SAMN01816559              2
##  5 BioSample: SAMN02178347              1
##  6 BioSample: SAMN02178348              1
##  7 BioSample: SAMN02178349              4
##  8 BioSample: SAMN04017959              1
##  9 BioSample: SAMN04017960              1
## 10 BioSample: SAMN04632378              1
## # … with 24 more rows
phytophthora_repeated <- ggplot(phytophthora_biosample_repeats, aes(x = biosample_accession_phytophthora, y = count)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = pc) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
phytophthora_repeated

Bacteria HGT

pinky_bacteria <- pinky_duplicates %>% 
  filter(sskingdoms == "Bacteria") %>% 
   group_by(qseqid, sseqid, staxids) %>%
  slice_min(order_by = evalue) %>%
  ungroup() %>% 
  mutate(genus = word(sscinames, 1))
pinky_bacteria
## # A tibble: 922 × 8
##    qseqid sseqid                 evalue staxids sscinames   sskin…¹ stitle genus
##    <chr>  <chr>                   <dbl> <chr>   <chr>       <chr>   <chr>  <chr>
##  1 A32    gb|MAR56914.1|      9.91e- 16 2026788 Rickettsia… Bacter… hypot… Rick…
##  2 A32    gb|MCA9330243.1|    3.23e- 15 2026720 Candidatus… Bacter… hypot… Cand…
##  3 A32    gb|MCE7958853.1|    7.27e- 50 2293626 Acidobacte… Bacter… hypot… Acid…
##  4 A32    gb|MCK9607861.1|    1.86e- 30 418     Methylomon… Bacter… hypot… Meth…
##  5 A32    gb|MDE2097674.1|    2.09e- 48 2052139 Patescibac… Bacter… ATP-b… Pate…
##  6 A32    gb|NBP51235.1|      4.51e- 53 1883427 Actinomyce… Bacter… hypot… Acti…
##  7 A32    gb|NBP85256.1|      4.87e- 30 2448780 Mycobacter… Bacter… hypot… Myco…
##  8 A32    ref|WP_292571174.1| 1.35e- 30 418     Methylomon… Bacter… hypot… Meth…
##  9 D5     gb|MCK9604135.1|    2.11e-137 2035772 Candidatus… Bacter… PriCT… Cand…
## 10 D5     gb|MDE2099315.1|    6.3 e-150 2052139 Patescibac… Bacter… hypot… Pate…
## # … with 912 more rows, and abbreviated variable name ¹​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
pinky_bacteria_counts <- pinky_bacteria %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 2) # Only include those with more than one count 
print(pinky_bacteria_counts)
## # A tibble: 75 × 3
## # Groups:   qseqid, genus [75]
##    qseqid                                           genus               count
##    <chr>                                            <chr>               <int>
##  1 Deoxyuridine 5'-triphosphate nucleotidohydrolase Acidobacteriota         4
##  2 NODE_1151_length_41258_cov_5.297770_57           Alphaproteobacteria     5
##  3 NODE_1151_length_41258_cov_5.297770_57           Bacteroidota            8
##  4 NODE_1151_length_41258_cov_5.297770_57           Chitinophagaceae        5
##  5 NODE_1151_length_41258_cov_5.297770_57           Fluviispira             3
##  6 NODE_1151_length_41258_cov_5.297770_57           Mesorhizobium           3
##  7 NODE_1151_length_41258_cov_5.297770_57           Saprospiraceae          3
##  8 NODE_1151_length_41258_cov_5.297770_57           Silvanigrella           3
##  9 NODE_15262_length_7722_cov_5.326203_6            Deltaproteobacteria     4
## 10 NODE_15262_length_7722_cov_5.326203_6            Lujinxingia             8
## # … with 65 more rows
pinky_bacteria_ggplot <- ggplot(pinky_bacteria_counts, aes(x = genus, y = count)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = pc) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
pinky_bacteria_ggplot